%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,roc_auc_score
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy.stats import iqr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
data = pd.read_csv("vehicle.csv")
print('\n\nshape of data:', data.shape)
print('\n\nCount of target variable:\n\n', data['class'].value_counts(),'\n\nData types of attributes:\n\n')
print(data.info())
print('\n\n First five rows of data:\n\n', data.head())
print('\n\n Few statistical values of attributes:\n\n', data.describe())
En = LabelEncoder()
columns = data.columns
data['class'] = En.fit_transform(data['class'])
print(data['class'].head())
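# For reference, print the mapping learned by the label encoder
# (En.classes_ holds the original class names in encoded order).
print(dict(zip(En.classes_, En.transform(En.classes_))))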
newdata = data.copy()
print(pd.DataFrame(newdata.isnull().sum(), columns= ['Number of missing values']))
X = newdata.iloc[:,0:19]
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
transformed_values = imputer.fit_transform(X)
column = X.columns
newdata = pd.DataFrame(transformed_values, columns = column)
print(pd.DataFrame(newdata.isnull().sum(), columns= ['Number of missing values']))
plt.figure(figsize=(20, 15))
plt.subplot(3, 3, 1)
sns.boxplot(x=newdata['pr.axis_aspect_ratio'])
plt.subplot(3, 3, 2)
sns.boxplot(x=newdata['skewness_about'])
plt.subplot(3, 3, 3)
sns.boxplot(x=newdata['scaled_variance'])
plt.subplot(3, 3, 4)
sns.boxplot(x=newdata['radius_ratio'])
plt.subplot(3, 3, 5)
sns.boxplot(x=newdata['scaled_radius_of_gyration.1'])
plt.subplot(3, 3, 6)
sns.boxplot(x=newdata['scaled_variance.1'])
plt.subplot(3, 3, 7)
sns.boxplot(x=newdata['max.length_aspect_ratio'])
plt.subplot(3, 3, 8)
sns.boxplot(x=newdata['skewness_about.1'])
plt.show()
Q1 = newdata.quantile(0.25)
Q3 = newdata.quantile(0.75)
IQR = Q3 - Q1
newdata2 = newdata[~((newdata < (Q1 - 1.5 * IQR)) |(newdata > (Q3 + 1.5 * IQR))).any(axis=1)]
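# Report how many rows the IQR rule removed (rows with any value outside
# Q1 - 1.5*IQR or Q3 + 1.5*IQR are dropped above).
print('Rows before outlier removal:', newdata.shape[0], '| after:', newdata2.shape[0])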
print('\n\n After removal of outliers:\n\n')
plt.figure(figsize=(20, 15))
plt.subplot(3, 3, 1)
sns.boxplot(x=newdata2['pr.axis_aspect_ratio'])
plt.subplot(3, 3, 2)
sns.boxplot(x=newdata2['skewness_about'])
plt.subplot(3, 3, 3)
sns.boxplot(x=newdata2['scaled_variance'])
plt.subplot(3, 3, 4)
sns.boxplot(x=newdata2['radius_ratio'])
plt.subplot(3, 3, 5)
sns.boxplot(x=newdata2['scaled_radius_of_gyration.1'])
plt.subplot(3, 3, 6)
sns.boxplot(x=newdata2['scaled_variance.1'])
plt.subplot(3, 3, 7)
sns.boxplot(x=newdata2['max.length_aspect_ratio'])
plt.subplot(3, 3, 8)
sns.boxplot(x=newdata2['skewness_about.1'])
plt.show()
newdata2.hist(bins=20, figsize=(60,40))
plt.show()
skewness = newdata2.skew()
print("skewValue of dataframe attributes: \n", skewness)
f, ax = plt.subplots(1, 5, figsize=(30,5))
vis1 = sns.distplot(newdata2["scaled_variance.1"],bins=10, ax= ax[0])
vis2 = sns.distplot(newdata2["scaled_variance"],bins=10, ax=ax[1])
vis3 = sns.distplot(newdata2["skewness_about.1"],bins=10, ax= ax[2])
vis4 = sns.distplot(newdata2["skewness_about"],bins=10, ax=ax[3])
vis6 = sns.distplot(newdata2["scatter_ratio"],bins=10, ax=ax[4])
plt.show()
def correlation_heatmap(dataframe, l, w):
    correlation = dataframe.corr()
    plt.figure(figsize=(l, w))
    sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='viridis')
    plt.show()
features = newdata2.drop('class', axis=1)
correlation_heatmap(features, 30, 15)
sns.pairplot(features, diag_kind="kde")
X = newdata2.iloc[:,0:18].values
y = newdata2.iloc[:,18].values
S = StandardScaler()
X_scaled = S.fit_transform(X)
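# Sanity check: after standardization every feature should have mean ~0 and std ~1.
print('Scaled means ~0:', np.allclose(X_scaled.mean(axis=0), 0))
print('Scaled stds  ~1:', np.allclose(X_scaled.std(axis=0), 1))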
O_X_train,O_X_test,O_y_train,O_y_test = train_test_split(X_scaled,y,test_size=0.30,random_state=1)
print("{0:0.2f}% data is in training set".format((len(O_X_train)/len(data.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(O_X_test)/len(data.index)) * 100))
svc = SVC()
svc.fit(O_X_train,O_y_train)
O_y_predict = svc.predict(O_X_test)
print("Model Score On Original Data ",svc.score(O_X_test, O_y_test)*100, '%')
print('\n\nConfusion matrix of Original Data:\n', confusion_matrix(O_y_test, O_y_predict ))
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
results1 = cross_val_score(svc, X_scaled, y, cv=kfold)
print(results1)
print("Accuracy: %.3f%% (%.3f%%)" % (results1.mean()*100.0, results1.std()*100.0))
cov_matrix = np.cov(X_scaled.T)
print("\n\nCovariance_matrix shape:",cov_matrix.shape)
print("\n\nCovariance_matrix:\n\n",cov_matrix)
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
print('\n\nEigen Vectors:\n\n', eigenvectors)
print('\n Eigen Values: \n\n', eigenvalues)
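# The covariance matrix is symmetric, so its eigenvectors should be (numerically)
# orthonormal; verify that V.T @ V is close to the identity matrix.
print('Eigenvectors orthonormal:', np.allclose(eigenvectors.T @ eigenvectors, np.eye(len(eigenvalues))))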
eig_pairs = [(eigenvalues[index], eigenvectors[:,index]) for index in range(len(eigenvalues))]
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)
eigvalues_sorted = [eig_pairs[index][0] for index in range(len(eigenvalues))]
eigvectors_sorted = [eig_pairs[index][1] for index in range(len(eigenvalues))]
print('\nEigenvalues in descending order:\n\n', eigvalues_sorted)
summation = sum(eigenvalues)
variance = [(i / summation) for i in sorted(eigenvalues, reverse=True)]  # variance explained by each
# eigenvector; there are 18 entries, one per eigenvector
cum_variance = np.cumsum(variance)  # cumulative explained variance; the 18th entry
# reaches almost 100%
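# How many principal components are needed to reach an (assumed) 95% of the total
# variance; this is a check on the choice of 8 components made further below.
n_components_95 = int(np.argmax(cum_variance >= 0.95) + 1)
print('Components needed for 95% explained variance:', n_components_95)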
plt.bar(range(1,19), variance, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1,19),cum_variance, where= 'mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc = 'best')
plt.show()
P_reduce = np.array(eigvectors_sorted[0:8])
X_std_8D = np.dot(X_scaled,P_reduce.T)
reduced_pca = pd.DataFrame(X_std_8D)
reduced_pca
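# Cross-check the manual eigendecomposition against sklearn's PCA (imported above).
# Signs of individual components may differ, so compare explained-variance ratios instead.
pca_check = PCA(n_components=8)
pca_check.fit(X_scaled)
print('Explained variance ratio (manual):', np.round(variance[:8], 4))
print('Explained variance ratio (sklearn):', np.round(pca_check.explained_variance_ratio_, 4))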
sns.pairplot(reduced_pca, diag_kind='kde')
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(reduced_pca, y, test_size=0.30, random_state=1)
print("{0:0.2f}% data is in training set".format((len(pca_X_train)/len(reduced_pca)) * 100))
print("{0:0.2f}% data is in test set".format((len(pca_X_test)/len(reduced_pca)) * 100))
svc1 = SVC()
svc1.fit(pca_X_train,pca_y_train)
pca_y_predict = svc1.predict(pca_X_test)
print("Model Score On reduced Data ",svc1.score(pca_X_test, pca_y_test)*100, '%')
print('\n\nConfusion matrix of PCA Data:\n', confusion_matrix(pca_y_test, pca_y_predict))
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
results2 = cross_val_score(svc1,reduced_pca, y, cv=kfold)
print(results2)
print("Accuracy: %.3f%% (%.3f%%)" % (results2.mean()*100.0, results2.std()*100.0))
print("Classification Report For Raw Data:", "\n", classification_report(O_y_test,O_y_predict))
print("Classification Report For PCA:","\n", classification_report(pca_y_test,pca_y_predict))
print("\nCross Validation Accuracy of Raw data: %.3f%% (%.3f%%)" % (results1.mean()*100.0, results1.std()*100.0))
print("\nCross Validation Accuracy of PCA data: %.3f%% (%.3f%%)" % (results2.mean()*100.0, results2.std()*100.0))